My assigment 9

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(easystats)
## # Attaching packages: easystats 0.5.2 (red = needs update)
## ✖ insight     0.18.4    ✖ datawizard  0.6.2  
## ✔ bayestestR  0.13.0    ✔ performance 0.10.0 
## ✔ parameters  0.19.0    ✖ effectsize  0.7.0.5
## ✔ modelbased  0.8.5     ✖ correlation 0.8.2  
## ✔ see         0.7.3     ✔ report      0.5.5  
## 
## Restart the R-Session and update packages in red with `easystats::easystats_update()`.
library(dplyr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(DT)
library(modelr)
## 
## Attaching package: 'modelr'
## 
## The following objects are masked from 'package:performance':
## 
##     mae, mse, rmse

df<-read_csv("./data/GradSchool_Admissions.csv") 
## Rows: 400 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): admit, gre, gpa, rank
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Here is a glimpse of our dataset

df %>% head() %>% kable() %>% kable_classic(lightable_options = 'hover')
admit gre gpa rank
0 380 3.61 3
1 660 3.67 3
1 800 4.00 1
1 640 3.19 4
0 520 2.93 4
1 760 3.00 2

Let’s take a look at the relationships in this data

datfram<-df%>% mutate(admitancetf=case_when(admit==1~TRUE,
                        TRUE~FALSE))

p1<-datfram %>% ggplot(aes(x=gre,y=gpa,color=admitancetf))+
  geom_smooth(se=FALSE)+
  theme_minimal()
plotly::ggplotly(p1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
p<-datfram %>% 
  ggplot(aes(x=gre,y=rank,color=admitancetf))+
  geom_density2d()+
  theme_minimal()
plotly::ggplotly(p)

Here is an interactive table that you can filter for attributes and see individuals that fit that criteria

datatable(datfram, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T) )

Now let’s begin to model the data

First let’s look how at the variables interact with eachother from 3,000 feet

GGally::ggpairs(datfram)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Let’s make some models to attempt to explain the data

mod1<-glm(data=datfram,
          formula = admitancetf ~ gre,
          family = "binomial")
mod2<-glm(data=datfram,
          formula = admitancetf ~ gre + gpa,
          family = "binomial")
mod3<-glm(data=datfram,
          formula = admitancetf ~ gre + gpa + rank,
          family = "binomial")
mod4<-glm(data=datfram,
          formula = admitancetf ~ gre * gpa,
          family = "binomial")
mod5<-glm(data=datfram,
          formula = admitancetf ~ gre * gpa + rank,
          family = "binomial")
compare_performance(mod1,mod2,mod3,mod4,mod5) %>% plot()

compare_performance(mod1,mod2,mod3,mod4,mod5,rank=TRUE)
## # Comparison of Model Performance Indices
## 
## Name | Model | Tjur's R2 |  RMSE | Sigma | Log_loss | Score_log | Score_spherical |   PCP | AIC weights | BIC weights | Performance-Score
## -----------------------------------------------------------------------------------------------------------------------------------------
## mod5 |   glm |     0.105 | 0.440 | 1.075 |    0.571 |   -50.481 |           0.004 | 0.612 |       0.604 |       0.172 |            65.59%
## mod3 |   glm |     0.099 | 0.442 | 1.077 |    0.574 |   -50.496 |           0.004 | 0.609 |       0.396 |       0.827 |            64.60%
## mod4 |   glm |     0.052 | 0.453 | 1.098 |    0.597 |   -48.633 |           0.004 | 0.589 |    3.99e-05 |    8.33e-05 |            39.47%
## mod2 |   glm |     0.047 | 0.455 | 1.100 |    0.600 |   -48.679 |           0.004 | 0.587 |    3.11e-05 |    4.78e-04 |            34.95%
## mod1 |   glm |     0.034 | 0.458 | 1.105 |    0.608 |   -48.247 |           0.003 | 0.581 |    4.86e-06 |    5.50e-04 |            22.23%
add_predictions(datfram,mod5,type = "response") %>% 
  ggplot(aes(x=gre,y=pred))+
  geom_point(color="black")+
  geom_point(aes(y=admit,x=gre),alpha=.5,color="red")

# I am not sure how to make this shitty binomial data look better. 
# I think the predictions are the chance that they will get in.